{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diabetes Data Set特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入必要的工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the Pima Indians diabetes dataset; the path is relative to the working directory\n",
    "DATA_PATH = 'pima-indians-diabetes.csv'\n",
    "diabetes = pd.read_csv(DATA_PATH)\n",
    "diabetes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      "pregnants                       768 non-null int64\n",
      "Plasma_glucose_concentration    768 non-null int64\n",
      "blood_pressure                  768 non-null int64\n",
      "Triceps_skin_fold_thickness     768 non-null int64\n",
      "serum_insulin                   768 non-null int64\n",
      "BMI                             768 non-null float64\n",
      "Diabetes_pedigree_function      768 non-null float64\n",
      "Age                             768 non-null int64\n",
      "Target                          768 non-null int64\n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n"
     ]
    }
   ],
   "source": [
    "# Print each column's dtype and non-null count\n",
    "diabetes.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "表面上看数据没有缺失值，但实际上肯定有缺失值，只是被标记为0了。比如BMI和舒张压两列中的0作为指标数值来说毫无意义。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "      <td>0.348958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "      <td>0.476951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "count  768.000000                    768.000000      768.000000   \n",
       "mean     3.845052                    120.894531       69.105469   \n",
       "std      3.369578                     31.972618       19.355807   \n",
       "min      0.000000                      0.000000        0.000000   \n",
       "25%      1.000000                     99.000000       62.000000   \n",
       "50%      3.000000                    117.000000       72.000000   \n",
       "75%      6.000000                    140.250000       80.000000   \n",
       "max     17.000000                    199.000000      122.000000   \n",
       "\n",
       "       Triceps_skin_fold_thickness  serum_insulin         BMI  \\\n",
       "count                   768.000000     768.000000  768.000000   \n",
       "mean                     20.536458      79.799479   31.992578   \n",
       "std                      15.952218     115.244002    7.884160   \n",
       "min                       0.000000       0.000000    0.000000   \n",
       "25%                       0.000000       0.000000   27.300000   \n",
       "50%                      23.000000      30.500000   32.000000   \n",
       "75%                      32.000000     127.250000   36.600000   \n",
       "max                      99.000000     846.000000   67.100000   \n",
       "\n",
       "       Diabetes_pedigree_function         Age      Target  \n",
       "count                  768.000000  768.000000  768.000000  \n",
       "mean                     0.471876   33.240885    0.348958  \n",
       "std                      0.331329   11.760232    0.476951  \n",
       "min                      0.078000   21.000000    0.000000  \n",
       "25%                      0.243750   24.000000    0.000000  \n",
       "50%                      0.372500   29.000000    0.000000  \n",
       "75%                      0.626250   41.000000    1.000000  \n",
       "max                      2.420000   81.000000    1.000000  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-column summary statistics (count, mean, std, min, quartiles, max)\n",
    "diabetes.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从结果中我们可以看到很多列的最小值为0，而在一些特定列代表的变量中，0值并没有意义，这就表明该值无效或为缺失值。\n",
    "\n",
    "具体来说，下列变量的最小值为0时数据无意义： 1、血浆葡萄糖浓度 2、舒张压 3、肱三头肌皮褶厚度 4、餐后血清胰岛素 5、体重指数\n",
    "\n",
    "在Pandas的DataFrame中，通过replace()函数可以很方便的将我们感兴趣的数据子集的值标记为NaN。\n",
    "\n",
    "标记完缺失值之后，可以利用isnull()函数将数据集中所有的NaN值标记为True，然后就可以得到每一列中缺失值的数量了。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分开特征和标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: every column except the label\n",
    "X_diabetes = diabetes.drop(columns='Target')\n",
    "\n",
    "# Label: the Target column\n",
    "y_diabetes = diabetes['Target']\n",
    "\n",
    "# Remember the original feature names; later cells derive new column names from them\n",
    "columns_org = X_diabetes.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.feat编码：log(x+1)\n",
    "原始特征feat_x看起来像计数特征，取log运算更接近人对数字的敏感度，更适合线性模型。 同时也可以降低长尾分布中大数值的影响，减弱分布的长尾性。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_log</th>\n",
       "      <th>Plasma_glucose_concentration_log</th>\n",
       "      <th>blood_pressure_log</th>\n",
       "      <th>Triceps_skin_fold_thickness_log</th>\n",
       "      <th>serum_insulin_log</th>\n",
       "      <th>BMI_log</th>\n",
       "      <th>Diabetes_pedigree_function_log</th>\n",
       "      <th>Age_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.945910</td>\n",
       "      <td>5.003946</td>\n",
       "      <td>4.290459</td>\n",
       "      <td>3.583519</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.543854</td>\n",
       "      <td>0.486738</td>\n",
       "      <td>3.931826</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>4.454347</td>\n",
       "      <td>4.204693</td>\n",
       "      <td>3.401197</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.317816</td>\n",
       "      <td>0.300845</td>\n",
       "      <td>3.465736</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2.197225</td>\n",
       "      <td>5.214936</td>\n",
       "      <td>4.174387</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.190476</td>\n",
       "      <td>0.514021</td>\n",
       "      <td>3.496508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>4.499810</td>\n",
       "      <td>4.204693</td>\n",
       "      <td>3.178054</td>\n",
       "      <td>4.553877</td>\n",
       "      <td>3.370738</td>\n",
       "      <td>0.154436</td>\n",
       "      <td>3.091042</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.927254</td>\n",
       "      <td>3.713572</td>\n",
       "      <td>3.583519</td>\n",
       "      <td>5.129899</td>\n",
       "      <td>3.786460</td>\n",
       "      <td>1.190279</td>\n",
       "      <td>3.526361</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_log  Plasma_glucose_concentration_log  blood_pressure_log  \\\n",
       "0       1.945910                          5.003946            4.290459   \n",
       "1       0.693147                          4.454347            4.204693   \n",
       "2       2.197225                          5.214936            4.174387   \n",
       "3       0.693147                          4.499810            4.204693   \n",
       "4       0.000000                          4.927254            3.713572   \n",
       "\n",
       "   Triceps_skin_fold_thickness_log  serum_insulin_log   BMI_log  \\\n",
       "0                         3.583519           0.000000  3.543854   \n",
       "1                         3.401197           0.000000  3.317816   \n",
       "2                         0.000000           0.000000  3.190476   \n",
       "3                         3.178054           4.553877  3.370738   \n",
       "4                         3.583519           5.129899  3.786460   \n",
       "\n",
       "   Diabetes_pedigree_function_log   Age_log  \n",
       "0                        0.486738  3.931826  \n",
       "1                        0.300845  3.465736  \n",
       "2                        0.514021  3.496508  \n",
       "3                        0.154436  3.091042  \n",
       "4                        1.190279  3.526361  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# log(1 + x) of every feature value; a value of 0 maps to 0\n",
    "log_values = np.log1p(X_diabetes.values)\n",
    "\n",
    "# Wrap the result in a DataFrame whose column names carry a '_log' suffix\n",
    "feat_names = columns_org + '_log'\n",
    "X_log = pd.DataFrame(log_values, columns=feat_names)\n",
    "\n",
    "X_log.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.feat编码： TF-IDF\n",
    "原始特征feat_x看起来像计数特征，类似文本分析中词频特征的处理，TF-IDF可以突出对特定类别有贡献的低频词。 这里原始特征已经是计数特征了，直接调用TfidfTransformer，将计数特征变成TF-IDF。 如果输入是原始文本，需要将计数功能（TF）和IDF功能集中在一起，用TfidfVectorizer。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_tfidf</th>\n",
       "      <th>Plasma_glucose_concentration_tfidf</th>\n",
       "      <th>blood_pressure_tfidf</th>\n",
       "      <th>Triceps_skin_fold_thickness_tfidf</th>\n",
       "      <th>serum_insulin_tfidf</th>\n",
       "      <th>BMI_tfidf</th>\n",
       "      <th>Diabetes_pedigree_function_tfidf</th>\n",
       "      <th>Age_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.037717</td>\n",
       "      <td>0.810132</td>\n",
       "      <td>0.409804</td>\n",
       "      <td>0.256931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.185363</td>\n",
       "      <td>0.003410</td>\n",
       "      <td>0.271919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.009341</td>\n",
       "      <td>0.691357</td>\n",
       "      <td>0.558183</td>\n",
       "      <td>0.316326</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.218049</td>\n",
       "      <td>0.002836</td>\n",
       "      <td>0.250508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.046188</td>\n",
       "      <td>0.920021</td>\n",
       "      <td>0.334562</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.118057</td>\n",
       "      <td>0.003357</td>\n",
       "      <td>0.159835</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.005813</td>\n",
       "      <td>0.450469</td>\n",
       "      <td>0.347351</td>\n",
       "      <td>0.156119</td>\n",
       "      <td>0.787603</td>\n",
       "      <td>0.143341</td>\n",
       "      <td>0.000840</td>\n",
       "      <td>0.105602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.426849</td>\n",
       "      <td>0.129587</td>\n",
       "      <td>0.146243</td>\n",
       "      <td>0.866498</td>\n",
       "      <td>0.135338</td>\n",
       "      <td>0.007082</td>\n",
       "      <td>0.102151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_tfidf  Plasma_glucose_concentration_tfidf  blood_pressure_tfidf  \\\n",
       "0         0.037717                            0.810132              0.409804   \n",
       "1         0.009341                            0.691357              0.558183   \n",
       "2         0.046188                            0.920021              0.334562   \n",
       "3         0.005813                            0.450469              0.347351   \n",
       "4         0.000000                            0.426849              0.129587   \n",
       "\n",
       "   Triceps_skin_fold_thickness_tfidf  serum_insulin_tfidf  BMI_tfidf  \\\n",
       "0                           0.256931             0.000000   0.185363   \n",
       "1                           0.316326             0.000000   0.218049   \n",
       "2                           0.000000             0.000000   0.118057   \n",
       "3                           0.156119             0.787603   0.143341   \n",
       "4                           0.146243             0.866498   0.135338   \n",
       "\n",
       "   Diabetes_pedigree_function_tfidf  Age_tfidf  \n",
       "0                          0.003410   0.271919  \n",
       "1                          0.002836   0.250508  \n",
       "2                          0.003357   0.159835  \n",
       "3                          0.000840   0.105602  \n",
       "4                          0.007082   0.102151  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-weight the count-like features as TF-IDF values\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "\n",
    "tfidf = TfidfTransformer()\n",
    "# fit_transform yields a sparse matrix; .toarray() densifies it for the DataFrame below\n",
    "tfidf_dense = tfidf.fit_transform(X_diabetes).toarray()\n",
    "\n",
    "# Rebuild a DataFrame with '_tfidf'-suffixed column names so the result is easy to inspect\n",
    "feat_names = columns_org + '_tfidf'\n",
    "X_tfidf = pd.DataFrame(tfidf_dense, columns=feat_names)\n",
    "\n",
    "X_tfidf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.其他特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                         0\n",
      "Plasma_glucose_concentration      5\n",
      "blood_pressure                   35\n",
      "Triceps_skin_fold_thickness     227\n",
      "serum_insulin                   374\n",
      "BMI                              11\n",
      "Diabetes_pedigree_function        0\n",
      "Age                               0\n",
      "Target                            0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# A reading of 0 is not meaningful for these measurements, so 0 marks a missing value\n",
    "NaN_col_names = ['Plasma_glucose_concentration','blood_pressure','Triceps_skin_fold_thickness','serum_insulin','BMI']\n",
    "# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there\n",
    "diabetes[NaN_col_names] = diabetes[NaN_col_names].replace(0, np.nan)\n",
    "# Count of missing values per column after the replacement\n",
    "print(diabetes.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>Triceps_skin_fold_thickness_Missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>45.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Triceps_skin_fold_thickness  Triceps_skin_fold_thickness_Missing\n",
       "0                         35.0                                    0\n",
       "1                         29.0                                    0\n",
       "2                          NaN                                    1\n",
       "3                         23.0                                    0\n",
       "4                         35.0                                    0\n",
       "5                          NaN                                    1\n",
       "6                         32.0                                    0\n",
       "7                          NaN                                    1\n",
       "8                         45.0                                    0\n",
       "9                          NaN                                    1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Many values are missing in this column, so add a 0/1 indicator: 1 where the measurement is NaN, 0 otherwise\n",
    "diabetes['Triceps_skin_fold_thickness_Missing'] = diabetes['Triceps_skin_fold_thickness'].isnull().astype('int64')\n",
    "diabetes[['Triceps_skin_fold_thickness','Triceps_skin_fold_thickness_Missing']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xacea9e8>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAGnpJREFUeJzt3XuQFeWd//H3Ry6CEUVgdJEBId42EJdBRzRGE2+/H4Ssikm0YJP1HkwVScwmsjGplCLRqiSbROOa6GK8YGJQ10RF10vUqKzZKCIiAi4lRn8yooKorC6iMn5/f/QzchyfGQ44PWdgPq+qU3P66ae7v+cMnM883X26FRGYmZm1tl2tCzAzs67JAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsq2etC/goBg0aFMOHD691GWZmW5XHHnvslYio21S/rToghg8fzvz582tdhpnZVkXS/6umn3cxmZlZlgPCzMyyHBBmZpa1VR+DMDOrlXfffZempibWr19f61La1KdPH+rr6+nVq9cWLe+AMDPbAk1NTfTr14/hw4cjqdblfEhEsGbNGpqamhgxYsQWrcO7mMzMtsD69esZOHBglwwHAEkMHDjwI41wHBBmZluoq4ZDi49anwPCzMyyfAzCzKwDrFmzhqOOOgqAl156iR49elBXV3xZed68efTu3bvDt7lgwQJWrVrF+PHjO3zd4IDggGnX1rqELuOxfzmp1iWYbbUGDhzIwoULAZg+fTo77rgjZ599dtXLNzc306NHj83a5oIFC1i8eHFpAeFdTGZmJTvmmGM44IADGDVqFL/+9a8B2LBhA/379+cHP/gBY8eOZd68ecyZM4d9992Xww47jG984xtMnDgRgDfffJNTTjmFsWPHMmbMGG677TbeeustZsyYwXXXXUdDQwM33XRTh9dd2ghCUh9gLrB92s5NEXGepGuAzwJrU9dTImKhiqMpvwAmAOtS+4Ky6jMz6yyzZs1iwIABrFu3jsbGRr74xS/Sr18/1q5dy/77788FF1zAunXr2Gefffjzn//MsGHDOPHEE99ffsaMGYwfP55rrrmG1157jYMOOohFixZx7rnnsnjxYi6++OJS6i5zBPE2cGREjAYagPGSDk7zpkVEQ3osTG2fA/ZOjynAZSXWZmbWaS666CJGjx7Npz71KZqamnjmmWcA6N27N8cffzwAS5cuZd9992WPPfZAEpMnT35/+T/+8Y9ceOGFNDQ0cMQRR7B+/Xqef/750usubQQREQG8mSZ7pUe0s8hxwLVpuYcl9Zc0OCJeLKtGM7Oy3XvvvcydO5eHH36Yvn37cuihh77/3YS+ffu+fypq8dGXFxHccsst7Lnnnh9onzt3bnmFU/IxCEk9JC0EVgH3RMQjadaFkhZJukjS9qltCLCiYvGm1GZmttVau3YtAwYMoG/fvixZsoRHH30022/UqFEsW7aMFStWEBHccMMN788bN24cl1xyyfvTjz/+OAD9+vXjjTfeKK32UgMiIpojogGoB8ZK+iTwPeBvgQOBAcB3U/fcNzo+FKmSpkiaL2n+6tWrS6rczKxjfP7zn2fdunWMHj2aGTNmcNBBB2X77bDDDlx66aUcffTRHHbYYey+++7svPPOAJx33nmsW7eO/fbbj1GjRjF9+nQAjjzySJ544gnGjBmzdR2krhQRr0t6ABgfET9NzW9LuhpoOQ+sCRhasVg9sDKzrpnATIDGxsb2dlmZmdVEywc4FBfMu/vuu7P9Xn/99Q9MH3300SxbtoyI4Mwzz6SxsRGAj33sY1xxxRUfWr6urq7Um6aVNoKQVCepf3reFzga+G9Jg1ObgInA4rTIHOAkFQ4G1vr4g5l1J5dddhkNDQ2MHDmSt956i69+9as1rafMEcRgYJakHhRBdGNE3C7pT5LqKHYpLQS+lvrfQXGK63KK01xP
LbE2M7MuZ9q0aUybNq3WZbyvzLOYFgFjMu1HttE/gKll1WNmZpvH36Q2M7MsB4SZmWU5IMzMLKvbX83VzKwjdPSVoau5uvJdd93FWWedRXNzM2eccQbnnHNOh9bgEYSZ2VaoubmZqVOncuedd7J06VJmz57N0qVLO3QbDggzs63QvHnz2Guvvfj4xz9O7969mTRpErfeemuHbsMBYWa2FXrhhRcYOnTjxSfq6+t54YUXOnQbDggzs61Q7uqvLVeG7SgOCDOzrVB9fT0rVmy8AHZTUxO77757h27DAWFmthU68MADefrpp3n22Wd55513uP766zn22GM7dBs+zdXMrANUc1pqR+rZsyeXXnop48aNo7m5mdNOO41Ro0Z17DY6dG1mZtZpJkyYwIQJE0pbv3cxmZlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsy6e5mpl1gOdn7Neh6xt27pOb7HPaaadx++23s+uuu7J48eIO3T54BGFmttU65ZRTuOuuu0pbf2kBIamPpHmSnpC0RNL5qX2EpEckPS3pBkm9U/v2aXp5mj+8rNrMzLYFn/nMZxgwYEBp6y9zBPE2cGREjAYagPGSDgZ+DFwUEXsDrwGnp/6nA69FxF7ARamfmZnVSGkBEYU302Sv9AjgSOCm1D4LmJieH5emSfOPUkdfu9bMzKpW6jEIST0kLQRWAfcAzwCvR8SG1KUJGJKeDwFWAKT5a4GBZdZnZmZtKzUgIqI5IhqAemAs8Ilct/QzN1r40B0xJE2RNF/S/NWrV3dcsWZm9gGdcpprRLwu6QHgYKC/pJ5plFAPrEzdmoChQJOknsDOwKuZdc0EZgI0NjZ++JZKZmY1UM1pqR1t8uTJPPDAA7zyyivU19dz/vnnc/rpp296wSqVFhCS6oB3Uzj0BY6mOPB8P/Al4HrgZKDlLttz0vRf0vw/Re6eemZmBsDs2bNLXX+ZI4jBwCxJPSh2Zd0YEbdLWgpcL+kC4HHgytT/SuA3kpZTjBwmlVibmZltQmkBERGLgDGZ9r9SHI9o3b4eOKGseszMbPP4m9RmZluoq+8F/6j1OSDMzLZAnz59WLNmTZcNiYhgzZo19OnTZ4vX4Yv1mZltgfr6epqamujKp9v36dOH+vr6LV7eAWFmtgV69erFiBEjal1GqbyLyczMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZVmkBIWmopPslPSVpiaSzUvt0SS9IWpgeEyqW+Z6k5ZKWSRpXVm1mZrZpZd5RbgPwnYhYIKkf8Jike9K8iyLip5WdJY0EJgGjgN2BeyXtExHNJdZoZmZtKG0EEREvRsSC9PwN4ClgSDuLHAdcHxFvR8SzwHJgbFn1mZlZ+zrlGISk4cAY4JHU9HVJiyRdJWmX1DYEWFGxWBOZQJE0RdJ8SfO78s3Czcy2dqUHhKQdgd8D34qI/wEuA/YEGoAXgZ+1dM0sHh9qiJgZEY0R0VhXV1dS1WZmVmpASOpFEQ7XRcQfACLi5Yhojoj3gCvYuBupCRhasXg9sLLM+szMrG1lnsUk4ErgqYj4eUX74IpuxwOL0/M5wCRJ20saAewNzCurPjMza1+ZZzF9GvhH4ElJC1Pb94HJkhoodh89B5wJEBFLJN0ILKU4A2qqz2AyM6ud0gIiIh4if1zhjnaWuRC4sKyazMysev4mtZmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllVRUQku6rps3MzLYd7d5yVFIfYAdgkKRd2HgL0Z2A3UuuzczMamhTI4gzgceAv00/Wx63Ar9sb0FJQyXdL+kpSUsknZXaB0i6R9LT6ecuqV2SLpG0
XNIiSft/1BdnZmZbrt2AiIhfRMQI4OyI+HhEjEiP0RFx6SbWvQH4TkR8AjgYmCppJHAOcF9E7A3cl6YBPgfsnR5TgMu2/GWZmdlH1e4uphYR8a+SDgGGVy4TEde2s8yLwIvp+RuSngKGAMcBh6dus4AHgO+m9msjIoCHJfWXNDitx8zMOllVASHpN8CewEKgOTUH0GZAtFp+ODAGeATYreVDPyJelLRr6jYEWFGxWFNqc0CYmdVAVQEBNAIj01/3m0XSjsDvgW9FxP9IarNrpu1D25M0hWIXFMOGDdvccszMrErVfg9iMfA3m7tySb0owuG6iPhDan5Z0uA0fzCwKrU3AUMrFq8HVrZeZ0TMjIjGiGisq6vb3JLMzKxK1QbEIGCppLslzWl5tLeAiqHClcBTEfHzillzgJPT85MpzohqaT8pnc10MLDWxx/MzGqn2l1M07dg3Z8G/hF4UtLC1PZ94EfAjZJOB54HTkjz7gAmAMuBdcCpW7BNMzPrINWexfTg5q44Ih4if1wB4KhM/wCmbu52zMysHNWexfQGGw8Y9wZ6Af8bETuVVZiZmdVWtSOIfpXTkiYCY0upyMzMuoQtupprRNwCHNnBtZiZWRdS7S6mL1RMbkfxvYjN/k6EmZltPao9i+mYiucbgOcoLo1hZmbbqGqPQfiUUzOzbqbaGwbVS7pZ0ipJL0v6vaT6soszM7PaqXYX09XA79j4pbavpLb/U0ZRVhvPz9iv1iV0GcPOfbLWJZjVXLVnMdVFxNURsSE9rgF8ISQzs21YtQHxiqSvSOqRHl8B1pRZmJmZ1Va1AXEacCLwEsX9Gb6Er5VkZrZNq/YYxA+BkyPiNSjuKw38lCI4zMxsG1TtCOLvWsIBICJepbhDnJmZbaOqDYjtJO3SMpFGENWOPszMbCtU7Yf8z4D/knQTxSU2TgQuLK0qMzOruWq/SX2tpPkUF+gT8IWIWFpqZWZmVlNV7yZKgeBQMDPrJrboct9mZrbtc0CYmVmWA8LMzLJKCwhJV6Wrvy6uaJsu6QVJC9NjQsW870laLmmZpHFl1WVmZtUpcwRxDTA+035RRDSkxx0AkkYCk4BRaZlfSepRYm1mZrYJpQVERMwFXq2y+3HA9RHxdkQ8CywHxpZVm5mZbVotjkF8XdKitAuq5dvZQ4AVFX2aUpuZmdVIZwfEZcCeQAPFVWF/ltqV6Ru5FUiaImm+pPmrV68up0ozM+vcgIiIlyOiOSLeA65g426kJmBoRdd6YGUb65gZEY0R0VhX53sWmZmVpVMDQtLgisnjgZYznOYAkyRtL2kEsDcwrzNrMzOzDyrtiqySZgOHA4MkNQHnAYdLaqDYffQccCZARCyRdCPFpTw2AFMjorms2szMbNNKC4iImJxpvrKd/hfiK8SamXUZ/ia1mZll+aY/Zl3UAdOurXUJXcZj/3JSrUvoljyCMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWWVFhCSrpK0StLiirYBku6R9HT6uUtql6RLJC2XtEjS/mXVZWZm1SlzBHENML5V2znAfRGxN3Bfmgb4HLB3ekwBLiuxLjMzq0JpARERc4FXWzUfB8xKz2cBEyvar43Cw0B/SYPLqs3MzDats49B7BYRLwKkn7um9iHAiop+TanNzMxqpKscpFamLbIdpSmS5kuav3r16pLLMjPrvjo7IF5u2XWUfq5K7U3A0Ip+9cDK3AoiYmZENEZEY11dXanFmpl1Z50dEHOAk9Pzk4FbK9pPSmczHQysbdkVZWZmtdGzrBVLmg0cDgyS1AScB/wIuFHS6cDzwAmp+x3ABGA5sA44tay6zMysOqUFRERMbmPWUZm+AUwtqxYzM9t8XeUgtZmZdTGljSDMzDrK8zP2q3UJXcawc5/stG15BGFm
ZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWTW5o5yk54A3gGZgQ0Q0ShoA3AAMB54DToyI12pRn5mZ1XYEcURENEREY5o+B7gvIvYG7kvTZmZWI11pF9NxwKz0fBYwsYa1mJl1e7UKiAD+KOkxSVNS224R8SJA+rlrjWozMzNqdAwC+HRErJS0K3CPpP+udsEUKFMAhg0bVlZ9ZmbdXk1GEBGxMv1cBdwMjAVeljQYIP1c1cayMyOiMSIa6+rqOqtkM7Nup9MDQtLHJPVreQ78X2AxMAc4OXU7Gbi1s2szM7ONarGLaTfgZkkt2/9dRNwl6VHgRkmnA88DJ9SgNjMzSzo9ICLir8DoTPsa4KjOrsfMzPK60mmuZmbWhTggzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy+pyASFpvKRlkpZLOqfW9ZiZdVddKiAk9QB+CXwOGAlMljSytlWZmXVPXSoggLHA8oj4a0S8A1wPHFfjmszMuqWuFhBDgBUV002pzczMOlnPWhfQijJt8YEO0hRgSpp8U9Ky0qvqJvaAQcArta6jSzgv90/RasX/Nit0zL/NParp1NUCogkYWjFdD6ys7BARM4GZnVlUdyFpfkQ01roOs9b8b7M2utoupkeBvSWNkNQbmATMqXFNZmbdUpcaQUTEBklfB+4GegBXRcSSGpdlZtYtdamAAIiIO4A7al1HN+Vdd9ZV+d9mDSgiNt3LzMy6na52DMLMzLoIB4T58ibWZUm6StIqSYtrXUt35IDo5nx5E+virgHG17qI7soBYb68iXVZETEXeLXWdXRXDgjz5U3MLMsBYZu8vImZdU8OCNvk5U3MrHtyQJgvb2JmWQ6Ibi4iNgAtlzd5CrjRlzexrkLSbOAvwL6SmiSdXuuauhN/k9rMzLI8gjAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDopuRNFDSwvR4SdILFdO9W/W9W1K/WtXamqSHJDVk2reoTkkjJT0h6XFJw9vo01PS623M+62kie2s/9uS+lSxnqmSvtzOeo6WdEt7r6UzSDpDUkj6bEXbCaltYpq+WtK+m7ne4yVN6+h67aPrcrcctXJFxBqgAUDSdODNiPhpZR9JoviOzLjOr3DzfYQ6vwDcFBE/7Mh6KnwbuApY316niPhlSdsvw5PAZODBND0JeKJlZkScurkrjIibO6Y062geQRgAkvaStFjS5cACYHD65mr/NP9USYvSX9xXp7bdJP1B0nxJ8yQdnNovkDRL0v2SnpZ0WmofkkYBC9O2Dmmjlp6SfiPpydTvm63m90h/vU9P002S+le8hislLZF0Z8tf8JltHEvxDfKvSbo3tf1zWn6xpG9kltlO0q8kLZV0GzConffzn4Bdgf9sWX9q/1F6D/8iadeK9+tb6fk+kv6U+ixoPbKRdFBLe1ruSkkPSvqrpKkV/U5Ov5OFqebt2npfJf1Tek1PSPptW68peQA4JK1rJ2AY8P7NfFpGeZuzrTQyuTg9/62kX0j6r/Sajk/tPSRdnn6vt0m6S+2M3qxjeARhlUYCp0bE1wCKgQRIGg18FzgkIl6VNCD1vwT4SUQ8nD7Ibgc+mebtBxwC7AQskPQfwFeA2yLixypuVNS3jToOAAZFxH5p+/0r5vUEfgcsiIgfZ5bdF5gcEU9K+gMwkeIeFx8QEXMkjQVeiYiL0/MvU9wfowcwT9KDwNKKxb4EjEivcfc07/LcC4iIiyR9BzgsIl6X1BPYGXgwIs6R9HPgNOBHrRadDUyPiNtSuG0H7JXeh8OAi4BjI6Ip/X72AY4C+gNP
qQj4TwDHU/y+NkiaSfGX/jNtvK//DOwREe+0eq9z3qMIiaOB3YBb0vZaa+t3WM22dgU+TfFv6EbgZuAEisvQ7wf8DcVlYbLvvXUcjyCs0jMR8Wim/Ujghoh4FaDlJ8WHxOWSFlJ8UOwiqeVD/5aIWB8Rq4C5wIEUFwY8Q9J5wCcj4s026lhOce2dX0gaB6ytmHclbYcDFDc/ejI9fwwYvonX3OIw4PcRsS4i3kiv59BWfT4DzI6I9yKiieKDcnO8FRF3tlWbpF0oPlRvA0jv37o0+5PAr4C/T9tucXtEvJPe51eBOorfy4HA/PS7+SywJ22/r0uA36o4DvJuFa/jeorAmUQmfJOPsq1borCIjfcmOZTiOmHvRcRKNu7ishI5IKzS/7bRLvL3iBAwNiIa0mNIRLyV5rXuHxHxJ+Bw4EXgOrVxYDYdJ/k74CHgm8C/Vcz+M3CUpO3bqPXtiufNVD9Kzt0XI1telf1y3ql43lZtba1/ZVq+9UH63OsVcFXF72XfiPhhO+/rOIq/xsdShEqPTbyOvwD7AztFxDO5Dh9xW5WvSa1+WidyQFg17gUmtexaqtjFdC9Qud+78sNroqTtJQ2i+Ot8vqQ9gJciYibFvYbH5DYmqY7iIPm/A+dRfBi1mJm2e33abdNR5gLHS+oraUeK267+Z6bPpLQ/fwjFX+bteQOo+uyqiHgNeEXSMQCS+kjaIc1+Ffh74CdpV1N77gVOTO99y5lrw3Lva/qArk/hPY1iBLJDWytOdQbwPeD7bfXpqG1VeAj4kgqDKUZzVjIfg7BNiohFkn4CzJW0gWL3yOkU4XCZpFMp/i3dz8bAeBS4k+JmROdFxMsqDlZ/W9K7wJsUxyRyhgJXqtjJHhTHPyrr+YmkC4FrJJ3UQa9xnopLS7fsYrssHceo/D9yE3AExUHZZRSB0Z6ZwL2SVgDjqyzly8C/pdf3DvDFihpfVHFw/Y72Xneq+/y07e0oduV8jWKE0fp97Qn8TsVpwtsBP0672NoVEf+xiS6532F2Wy3HujbhRopdnS3v/SN8cNejlcCX+7YOJ+kC0sHfWtdi2w5JO0bEm2l08ghwUESsrnVd2zKPIMxsa3FnOrW2F8Wo1OFQMo8grKYkzefDf6j8Q0QszfXfwm1cDhzcqvnnEXFtB61/DsX3ASqdHRH35vp3dZLOoPiOSKW5EfHNXH/bdjkgzMwsy2cxmZlZlgPCzMyyHBBmZpblgDAzsywHhJmZZf1/ghDaRNNOWUIAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xac33320>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plotting libraries for the count plots in this notebook.\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Render matplotlib figures inline in the cell output.\n",
    "%matplotlib inline\n",
    "\n",
    "# Count rows per missing-indicator value, split by Target class.\n",
    "sns.countplot(\n",
    "    data=diabetes,\n",
    "    x='Triceps_skin_fold_thickness_Missing',\n",
    "    hue='Target',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xae089e8>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFWVJREFUeJzt3XuwnHWd5/H311wmXLJAQmAIB0hAYCYRk8Ah6CquAjXEOHLxwpBaFQQJM4suTg3UMK4FMWN2nBoVZZhhC5AlzCKX9QKBTQUwNZjSFUPAGEIYBIUlh1tCxAgTAuTw3T/6OdCEX5JOOM/pPjnvV1VX9/Pr3+/3fLvr1Pn0c+mnIzORJGlz72h3AZKkzmRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklQ0vN0FvB177713Tpgwod1lSNKgct999z2XmeO21W9QB8SECRNYtmxZu8uQpEElIv5fK/3cxSRJKjIgJElFBoQkqWhQH4OQpHZ59dVX6enpYePGje0uZYtGjRpFV1cXI0aM2KHxBoQk7YCenh5Gjx7NhAkTiIh2l/MWmcm6devo6elh4sSJOzSHu5gkaQds3LiRsWPHdmQ4AEQEY8eOfVtbOAaEJO2gTg2HPm+3PgNCklTkMQhJ6gfr1q3j+OOPB+CZZ55h2LBhjBvX+LLy0qVLGTlyZL+v8/7772fNmjXMmDGj3+cGA4KjLryu3SV0jPv+4TPtLkEatMaOHcvy5csBmDNnDrvvvjsXXHBBy+N7e3sZNmzYdq3z/vvvZ+XKlbUFhLuYJKlmH/3oRznqqKOYPHkyV199NQCbNm1izz335Mtf/jLTp09n6dKlLFiwgMMPP5xjjz2WL3zhC5xyyikAvPjii5x55plMnz6dadOmcdttt/HSSy8xd+5crr/+eqZOncr3vve9fq97yG9BSFLd5s+fz5gxY9iwYQPd3d18/OMfZ/To0axfv54jjzySr371q2zYsIHDDjuMn/70pxx44IGcdtppr4+fO3cuM2bM4Nprr+X555/nmGOOYcWKFVx88cWsXLmSb33rW7XU7RaEJNXs0ksvZcqUKbz3ve+lp6eHX//61wCMHDmSU089FYBVq1Zx+OGHc9BBBxERzJo16/Xxd955J/PmzWPq1Kl86EMfYuPGjTzxxBO11+0WhCTV6Ec/+hFLlizhnnvuYZddduH973//699N2GWXXV4/FTUztzhHZnLLLbdwyCGHvKl9yZIl9RWOWxCSVKv169czZswYdtllFx588EHuvffeYr/Jkyfz8MMPs3r1ajKTm2666fXnTjzxRC677LLXl3/xi18AMHr0aF544YXaajcgJKlGH/nIR9iwYQNTpkxh7ty5HHPMMcV+u+66K5dffjknnHACxx57LOPHj2ePPfYA4JJLLmHDhg0cccQRTJ48mTlz5gBw3HHH8ctf/pJp06YNroPUEXEAcB3wh8BrwJWZ+e2ImAOcA6ytun4pMxdWY/4GOBvoBf5rZt5RV32SVJe+f+DQuGDeHXeU/5X97ne/e9PyCSecwMMPP0xmcu6559Ld3Q3AbrvtxlVXXfWW8ePGjav1R9PqPAaxCfirzLw/IkYD90XEXdVzl2bm15s7R8Qk4HRgMjAe+FFEHJaZvTXWKEkd44orruD666/n5Zdfpru7m3POOaet9dQWEJn5NPB09fiFiHgI2H8rQ04GbszMl4HHIuJRYDrws7pqlKROcuGFF3LhhRe2u4zXDcgxiIiYAEwDfl41fT4iVkTENRGxV9W2P7C6aVgPWw8USVKNag+IiNgd+D7wxcz8PXAFcAgwlcYWxjf6uhaGv+W8r4iYHRHLImLZ2rVrC0MkSf2h1oCIiBE0wuH6zPwBQGY+m5m9mfkacBWN3UjQ2GI4oGl4F/DU5nNm5pWZ2Z2Z3X0XwpIk9b/aAiIa3/74DvBQZn6zqX2/pm6n
AiurxwuA0yPiDyJiInAosLSu+iRJW1fnWUzvAz4NPBARy6u2LwGzImIqjd1HjwPnAmTmgxFxM7CKxhlQ53kGk6TBor+vDN3K1ZUXLVrE+eefT29vL5/73Oe46KKL+rWGOs9i+gnl4woLtzJmHjCvrpokaWfR29vLeeedx1133UVXVxdHH300J510EpMmTeq3dfhNakkahJYuXco73/lODj74YEaOHMnpp5/Orbfe2q/rMCAkaRB68sknOeCAN87r6erq4sknn+zXdRgQkjQIla7+2ndl2P5iQEjSINTV1cXq1W98t7inp4fx48f36zoMCEkahI4++mgeeeQRHnvsMV555RVuvPFGTjrppH5dhz8YJEn9oJXTUvvT8OHDufzyyznxxBPp7e3lrLPOYvLkyf27jn6dTVK/6e/z6gezgf7nO1jMnDmTmTNn1ja/u5gkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSijzNVZL6wRNzj+jX+Q68+IFt9jnrrLO4/fbb2WeffVi5cuU2+28vtyAkaZA688wzWbRoUW3zGxCSNEh94AMfYMyYMbXNb0BIkooMCElSkQEhSSoyICRJRZ7mKkn9oJXTUvvbrFmzuPvuu3nuuefo6uriK1/5CmeffXa/zW9ASNIgdcMNN9Q6v7uYJElFBoQkqciAkKQdlJntLmGr3m59BoQk7YBRo0axbt26jg2JzGTdunWMGjVqh+fwILUk7YCuri56enpYu3Ztu0vZolGjRtHV1bXD4w0ISdoBI0aMYOLEie0uo1buYpIkFRkQkqSi2gIiIg6IiH+NiIci4sGIOL9qHxMRd0XEI9X9XlV7RMRlEfFoRKyIiCPrqk2StG11bkFsAv4qM/8YeA9wXkRMAi4CFmfmocDiahngw8Ch1W02cEWNtUmStqG2gMjMpzPz/urxC8BDwP7AycD8qtt84JTq8cnAddlwD7BnROxXV32SpK0bkGMQETEBmAb8HNg3M5+GRogA+1Td9gdWNw3rqdokSW1Qe0BExO7A94EvZubvt9a10PaWb6BExOyIWBYRyzr5/GNJGuxqDYiIGEEjHK7PzB9Uzc/27Tqq7tdU7T3AAU3Du4CnNp8zM6/MzO7M7B43blx9xUvSEFfnWUwBfAd4KDO/2fTUAuCM6vEZwK1N7Z+pzmZ6D7C+b1eUJGng1flN6vcBnwYeiIjlVduXgK8BN0fE2cATwCer5xYCM4FHgQ3AZ2usTZK0DbUFRGb+hPJxBYDjC/0TOK+ueiRJ28dvUkuSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQV1RYQEXFNRKyJiJVNbXMi4smIWF7dZjY99zcR8WhEPBwRJ9ZVlySpNS0FREQsbqVtM9cCMwrtl2bm1Oq2sJprEnA6MLka888RMayV2iRJ9dhqQETEqIgYA+wdEXtFxJjqNgEYv7WxmbkE+G2LdZwM3JiZL2fmY8CjwPQWx0qSarCtLYhzgfuAP6ru+263Av+0g+v8fESsqHZB7VW17Q+sburTU7VJktpkqwGRmd/OzInABZl5cGZOrG5TMvPyHVjfFcAhwFTgaeAbVXuUVl+aICJmR8SyiFi2du3aHShBktSK4a10ysx/jIj/CExoHpOZ123PyjLz2b7HEXEVcHu12AMc0NS1C3hqC3NcCVwJ0N3dXQwRSdLb11JARMS/0PjkvxzorZoT2K6AiIj9MvPpavFUoO8MpwXAdyPimzSObRwKLN2euSVJ/aulgAC6gUmZ2fIn9oi4AfggjQPcPcAlwAcjYiqNcHmcxjEOMvPBiLgZWAVsAs7LzN7SvJKkgdFqQKwE/pDGcYOWZOasQvN3ttJ/HjCv1fkl
SfVqNSD2BlZFxFLg5b7GzDyplqokSW3XakDMqbMISVLnafUsph/XXYgkqbO0ehbTC7zxvYSRwAjg3zPzP9RVmCSpvVrdghjdvBwRp+ClMCRpp9bqMYg3ycxbIuKi/i5GkkqemHtEu0voGAde/MCAravVXUwfa1p8B43vRfgtZknaibW6BfHRpsebaHzJ7eR+r0aS1DFaPQbx2boLkSR1llZ/MKgrIn5Y/ULcsxHx/Yjoqrs4SVL7tPqTo/+TxgX1xtP4nYbbqjZJ0k6q1WMQ4zKzORCujYgv1lGQ2sczRd4wkGeKSJ2q1S2I5yLiUxExrLp9ClhXZ2GSpPZqNSDOAk4DnqFxRddPAB64lqSdWKu7mP4WOCMznweIiDHA12kEhyRpJ9TqFsS7+8IBIDN/C0yrpyRJUidoNSDeERF79S1UWxA7dJkOSdLg0Oo/+W8A/zcivkfjEhun4a+/SdJOrdVvUl8XEcuA44AAPpaZq2qtTJLUVi3vJqoCwVCQpCGi1WMQkqQhxoCQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpKLaAiIiromINRGxsqltTETcFRGPVPd7Ve0REZdFxKMRsSIijqyrLklSa+rcgrgWmLFZ20XA4sw8FFhcLQN8GDi0us0GrqixLklSC2oLiMxcAvx2s+aTgfnV4/nAKU3t12XDPcCeEbFfXbVJkrZtoI9B7JuZTwNU9/tU7fsDq5v69VRtbxERsyNiWUQsW7t2ba3FStJQ1ikHqaPQlqWOmXllZnZnZve4ceNqLkuShq6BDohn+3YdVfdrqvYe4ICmfl3AUwNcmySpyUAHxALgjOrxGcCtTe2fqc5meg+wvm9XlCSpPVr+ydHtFRE3AB8E9o6IHuAS4GvAzRFxNvAE8Mmq+0JgJvAosAH4bF11SZJaU1tAZOasLTx1fKFvAufVVYskaft1ykFqSVKHMSAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklQ0vB0rjYjHgReAXmBTZnZHxBjgJmAC8DhwWmY+3476JEnt3YL4UGZOzczuavkiYHFmHgosrpYlSW3SSbuYTgbmV4/nA6e0sRZJGvLaFRAJ3BkR90XE7Kpt38x8GqC636dNtUmSaNMxCOB9mflUROwD3BUR/9bqwCpQZgMceOCBddUnSUNeW7YgMvOp6n4N8ENgOvBsROwHUN2v2cLYKzOzOzO7x40bN1AlS9KQM+ABERG7RcTovsfAnwArgQXAGVW3M4BbB7o2SdIb2rGLaV/ghxHRt/7vZuaiiLgXuDkizgaeAD7ZhtokSZUBD4jM/A0wpdC+Djh+oOuRJJV10mmukqQOYkBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKmo4wIiImZExMMR8WhEXNTueiRpqOqogIiIYcA/AR8GJgGzImJSe6uSpKGpowICmA48mpm/ycxXgBuBk9tckyQNSZ0WEPsDq5uWe6o2SdIAG97uAjYThbZ8U4eI2cDsavHFiHi49qqGiINgb+C5dtfRES4p/SmqXfzbbNI/f5sHtdKp0wKiBzigabkLeKq5Q2ZeCVw5kEUNFRGxLDO7212HtDn/Ntuj03Yx3QscGhETI2IkcDqwoM01SdKQ1FFbEJm5KSI+D9wBDAOuycwH21yWJA1JHRUQAJm5EFjY7jqGKHfdqVP5t9kGkZnb7iVJ
GnI67RiEJKlDGBDy8ibqWBFxTUSsiYiV7a5lKDIghjgvb6IOdy0wo91FDFUGhLy8iTpWZi4BftvuOoYqA0Je3kRSkQGhbV7eRNLQZEBom5c3kTQ0GRDy8iaSigyIIS4zNwF9lzd5CLjZy5uoU0TEDcDPgMMjoicizm53TUOJ36SWJBW5BSFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIlYj484j4TD/PeW1EfKJ6fPWOXCk3IuZEREbEO5va/rJq666WF0bEnts5b7+/Xu1cOu4nR6VtiYjh1Rf8+lVm/o/+nnOz+T/3NoY/QONb7l+tlj8BrGqae+YO1FPr69Xg5xaE2iYidouI/xMRv4yIlRHxZxFxVET8OCLui4g7ImK/qu/dEfHfI+LHwPnNn8yr51+s7j9Yjb85In4VEV+LiP8cEUsj4oGIOGQr9cyJiAua1vf31bhfRcSxVfvkqm15RKyIiEMjYkLzD9pExAURMacw/91Nn/hfjIh51Wu/JyL23cbbdQvVZdgj4mBgPbC2ae7HI2Lv0ntaPf+1iFhV1fz17Xi9u1bv5YqIuCkift73GrTzMyDUTjOApzJzSma+C1gE/CPwicw8CrgGmNfUf8/M/E+Z+Y1tzDsFOB84Avg0cFhmTgeuBr6wHfUNr8Z9Ebikavtz4NuZORXopnGxwx2xG3BPZk4BlgDnbKP/74HVEfEuYBZw0xb6veU9jYgxwKnA5Mx8N29shWyu9Hr/C/B8Ne5vgaNae3naGRgQaqcHgBOqT67H0riq7LuAuyJiOfBlGleX7bOlf4qbuzczn87Ml4FfA3c2rW/CdtT3g+r+vqZxPwO+FBF/DRyUmS9tx3zNXgFuL8y/NTfS2M10CvDDLfR503uametphMtG4OqI+BiwYQtjS6/3/dV6ycyVwIoW6tROwoBQ22Tmr2h8In0A+Dvg48CDmTm1uh2RmX/SNOTfmx5vovr7jYgARjY993LT49eall9j+4679Y3r7RuXmd8FTgJeAu6IiOOaa6mMamHuV/ONC6G9Pv823EZji+iJzPx9qcPm72lEXFwdr5kOfJ9GuCzawvxveb2Ufy9EQ4QBobaJiPHAhsz8X8DXgWOAcRHx3ur5ERExeQvDH+eN3R0nAyNqLpeqpoOB32TmZTQui/5u4Flgn4gYGxF/APxpHeuutlb+mjfvdtu8vs3f0yMjYndgj8xcSGP30dTtWO1PgNOquSfR2G2nIcKzmNRORwD/EBGvAa8Cf0Hj0/hlEbEHjb/PbwGly49fBdwaEUuBxbx566JOfwZ8KiJeBZ4B5mbmqxExF/g58Bjwb3WtPDNv3EaX0ns6msZ7NYrGFsFfbscq/xmYHxErgF/Q2MW0frsL16Dk5b4lbVFEDANGZObG6gywxTQO+r/S5tI0ANyCkLQ1uwL/GhEjaGx9/IXhMHS4BaEhJyL+G/DJzZr/d2Zucd/+QOjUujR0GRCSpCLPYpIkFRkQkqQiA0KSVGRASJKKDAhJUtH/BzWoxajWghiZAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x84bdf98>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Many values are missing, so add an indicator column:\n",
    "# 1 where serum_insulin is null, 0 where a value is present.\n",
    "diabetes['serum_insulin_Missing'] = diabetes['serum_insulin'].isnull().astype('int64')\n",
    "\n",
    "# Compare the Target split between rows with and without a serum_insulin value.\n",
    "sns.countplot(data=diabetes, x='serum_insulin_Missing', hue='Target')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征是否缺失和目标也没什么关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                       0\n",
      "Plasma_glucose_concentration    0\n",
      "blood_pressure                  0\n",
      "Triceps_skin_fold_thickness     0\n",
      "serum_insulin                   0\n",
      "BMI                             0\n",
      "Diabetes_pedigree_function      0\n",
      "Age                             0\n",
      "Target                          0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Remove the temporary missing-indicator columns.\n",
    "# Rebinding the result (instead of inplace=True) together with errors='ignore' keeps this\n",
    "# cell re-runnable: a second run no longer raises KeyError once the columns are gone.\n",
    "diabetes = diabetes.drop(\n",
    "    ['Triceps_skin_fold_thickness_Missing', 'serum_insulin_Missing'],\n",
    "    axis=1,\n",
    "    errors='ignore',\n",
    ")\n",
    "\n",
    "# Fill the remaining nulls with each column's median.\n",
    "medians = diabetes.median()\n",
    "diabetes = diabetes.fillna(medians)\n",
    "\n",
    "# Per-column count of null values that are left after filling.\n",
    "print(diabetes.isnull().sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the table into the label column and the feature columns.\n",
    "y_diabetes = diabetes['Target']\n",
    "X_diabetes = diabetes.drop(columns=['Target'])\n",
    "\n",
    "# Keep the feature names; they are used later to rebuild a labelled DataFrame\n",
    "# from the scaled values when the result is saved.\n",
    "feat_names = X_diabetes.columns\n",
    "\n",
    "# Feature standardisation.\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Create the scaler for the input features.\n",
    "ss_X = StandardScaler()\n",
    "\n",
    "# The scaler is fitted on, and applied to, the whole feature matrix in one call;\n",
    "# this cell does not split the data into train and test sets.\n",
    "X_diabetes = ss_X.fit_transform(X_diabetes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Min-max scale the log data (X_log).\n",
    "# NOTE(review): X_log is not created in this cell or in any later cell - confirm that an\n",
    "# earlier cell defines it, otherwise this cell fails on Restart & Run All.\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Keep the feature names; they become the CSV header when the result is saved.\n",
    "feat_names_log = X_log.columns\n",
    "\n",
    "# Build the scaler, then fit it on X_log and rescale X_log in the same call\n",
    "# (fit_transform, not transform with a previously fitted scaler).\n",
    "ms_log = MinMaxScaler()\n",
    "X_log = ms_log.fit_transform(X_log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Min-max scale the tf-idf data (X_tfidf).\n",
    "# NOTE(review): X_tfidf is not created in this cell or in any later cell - confirm that an\n",
    "# earlier cell defines it, otherwise this cell fails on Restart & Run All.\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Keep the feature names; they become the CSV header when the result is saved.\n",
    "feat_names_tfidf = X_tfidf.columns\n",
    "\n",
    "# Build the scaler, then fit it on X_tfidf and rescale X_tfidf in the same call\n",
    "# (fit_transform, not transform with a previously fitted scaler).\n",
    "ms_tfidf = MinMaxScaler()\n",
    "X_tfidf = ms_tfidf.fit_transform(X_tfidf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征处理结果存为文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the original feature set (standardised values) together with the label.\n",
    "# Reusing y_diabetes.index keeps features and labels row-aligned in the concat below;\n",
    "# with a fresh 0..n-1 index the two would only line up if the source table also used one.\n",
    "X_diabetes = pd.DataFrame(data = X_diabetes, columns = feat_names, index = y_diabetes.index)\n",
    "\n",
    "# Put the Target column back next to the features.\n",
    "diabetes = pd.concat([X_diabetes, y_diabetes], axis = 1)\n",
    "\n",
    "# Write a header row but no index column.\n",
    "diabetes.to_csv('FE_pima_indians_diabetes.csv', index = False, header = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.866045</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.166619</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.205066</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.852200</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>2.016662</td>\n",
       "      <td>-0.693761</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-1.332500</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.073567</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.695245</td>\n",
       "      <td>-0.540642</td>\n",
       "      <td>-0.633881</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504422</td>\n",
       "      <td>-2.679076</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>0.316566</td>\n",
       "      <td>1.549303</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.639947                      0.866045       -0.031990   \n",
       "1  -0.844885                     -1.205066       -0.528319   \n",
       "2   1.233880                      2.016662       -0.693761   \n",
       "3  -0.844885                     -1.073567       -0.528319   \n",
       "4  -1.141852                      0.504422       -2.679076   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.670643      -0.181541  0.166619   \n",
       "1                    -0.012301      -0.181541 -0.852200   \n",
       "2                    -0.012301      -0.181541 -1.332500   \n",
       "3                    -0.695245      -0.540642 -0.633881   \n",
       "4                     0.670643       0.316566  1.549303   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  Target  \n",
       "0                    0.468492  1.425995       1  \n",
       "1                   -0.365061 -0.190672       0  \n",
       "2                    0.604397 -0.105584       1  \n",
       "3                   -0.920763 -1.041549       0  \n",
       "4                    5.484909 -0.020496       1  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the current diabetes table.\n",
    "diabetes.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the log feature-transform result: X_log columns followed by the Target label.\n",
    "y = pd.Series(y_diabetes, name='Target')\n",
    "test_log = pd.concat(\n",
    "    [pd.DataFrame(X_log, columns=feat_names_log), y],\n",
    "    axis=1,\n",
    ")\n",
    "# Write a header row but no index column.\n",
    "test_log.to_csv('FE_diabetes_log.csv', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the tf-idf feature-transform result: X_tfidf columns followed by the Target label.\n",
    "y = pd.Series(y_diabetes, name='Target')\n",
    "test_tfidf = pd.concat(\n",
    "    [pd.DataFrame(X_tfidf, columns=feat_names_tfidf), y],\n",
    "    axis=1,\n",
    ")\n",
    "# Write a header row but no index column.\n",
    "test_tfidf.to_csv('FE_diabetes_tfidf.csv', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
